package storm.xlog;
import storm.kafka.*;
import backtype.storm.Config;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.tuple.Tuple;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.TimeZone;
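/**
 * Storm topology that consumes nginx-style access logs from Kafka, aggregates
 * per-IP request statistics over a configurable time window, and inserts IPs
 * that exceed the request threshold while staying below the URL-scope
 * threshold into MySQL.
 */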
public class XlogKafkaSpoutTopology {
public static final Logger LOG = LoggerFactory.getLogger(XlogKafkaSpoutTopology.class);
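/**
 * Parses each log line and tracks, per client IP, request totals, the
 * static/dynamic split, HTTP methods, status-code classes, and the set of
 * distinct URL prefixes touched. IPs whose URL scope reaches scopeThreshold
 * are whitelisted and dropped from the counters to save memory. This is a
 * terminal bolt, so declareOutputFields() is intentionally empty.
 */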
public static class XlogBolt extends BaseBasicBolt {
private long intervalTime = 60;
private long totalThreshold = 30;
private long scopeThreshold = 10;
private String topic = "";
private String mysqlUrl = "";
private String mysqlUser = "";
private String mysqlPassword = "";
private boolean isStatic = true;
private int start_datetime = 0;
private int start_mm = 0;
private int total = 0, statics = 0, dynamics = 0;
Hashtable<String, HashMap<String, Integer>> hashIp = new Hashtable<String, HashMap<String, Integer>>(1000, 0.5F);
Hashtable<String, HashMap<String, Integer>> hashIpUrl = new Hashtable<String, HashMap<String, Integer>>(1000, 0.5F);
HashMap<String, Integer> ipWhitelist = new HashMap<String, Integer>();
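/**
 * Called once at bolt startup; thresholds and MySQL settings come from the
 * topology configuration (the properties file passed to main()).
 */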
@Override
public void prepare(Map stormConf, TopologyContext context) {
topic = (String) stormConf.get("xlog.kafka.topic.name");
totalThreshold = Long.parseLong((String) stormConf.get("insert.into.mysql.min.total"));
scopeThreshold = Long.parseLong((String) stormConf.get("insert.into.mysql.max.scope"));
intervalTime = Long.parseLong((String) stormConf.get("xlog.interval.time"));
mysqlUrl = (String) stormConf.get("mysql.url");
mysqlUser = (String) stormConf.get("mysql.user");
mysqlPassword = (String) stormConf.get("mysql.password");
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
@Override
public void execute(Tuple tuple, BasicOutputCollector collector) {
String line = tuple.getString(0);
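// The regex expects an nginx access-log line shaped like:
//   <ip> <host> - [dd/MMM/yyyy:HH:mm:ss +0800] "<method> <url> <proto>" <status> <size> "<referer>" "<user-agent>" "<forwarded-for>" <req-time> <upstream-time>
// The meaning of the last quoted field and the two timing fields is inferred
// from a typical nginx log_format and may differ per deployment.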
String regex = "([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})\\s(.+)\\s\\-\\s\\[(.+)\\s\\+0800\\]\\s\"(.+)\\s(.+)\\s(.+)\"\\s(\\d+)\\s(\\d+)\\s\"(.+)\"\\s\"(.+)\"\\s\"(.+)\"\\s(\\d+\\.\\d+|\\d+|\\-)\\s(\\d+\\.\\d+|\\d+|\\-)";
String ip = "";
String host = "-";
String datetime = "";
String method = "";
String url = "";
String code = "";
String size = "";
int re_time = 0;
int mm = 0;
boolean inWhitelist = false;
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(line);
while (matcher.find()) {
ip = matcher.group(1); // client IP
host = matcher.group(2); // host field
datetime = matcher.group(3);
method = matcher.group(4);
url = matcher.group(5);
code = matcher.group(7);
size = matcher.group(8);
total ++;
// Extract the timestamp from the log entry; the logs carry a fixed +0800
// offset, so the formatters are pinned to GMT+8 instead of mutating the
// JVM-wide default time zone.
SimpleDateFormat sdf = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
sdf.setTimeZone(TimeZone.getTimeZone("GMT+8:00"));
Date d;
try {
d = sdf.parse(datetime);
re_time = (int) (d.getTime() / 1000L); // epoch seconds
} catch (ParseException e) {
e.printStackTrace();
}
SimpleDateFormat sdf_mm = new SimpleDateFormat("mm", Locale.CHINA);
sdf_mm.setTimeZone(TimeZone.getTimeZone("GMT+8:00"));
mm = Integer.parseInt(sdf_mm.format(new Date(re_time * 1000L)));
// Mark the start of the current aggregation window.
if ( start_datetime == 0) {
start_datetime = re_time;
start_mm = mm;
}
// Skip the placeholder IP; whitelisted IPs are not counted any further.
if (ip.equals("-")) {
continue;
}
inWhitelist = ipWhitelist.containsKey(ip);
if ( inWhitelist ) {
continue;
}
// If this IP's URL scope has reached the whitelist threshold, add it to the
// whitelist and drop its hashIpUrl entry to save memory.
boolean urlMapIsExist = hashIpUrl.containsKey(ip);
HashMap<String, Integer> mapUrl = urlMapIsExist ? hashIpUrl.get(ip) : new HashMap<String, Integer>();
int mapUrlSize = mapUrl.size();
if ( mapUrlSize >= scopeThreshold ) {
hashIpUrl.remove(ip);
ipWhitelist.put(ip, 1);
inWhitelist = true;
}
// Classify the request as static (resource file) or dynamic.
if (url.toLowerCase().matches(".+(\\.jpg|\\.png|\\.js|\\.gif|\\.css|\\.ico|\\.swf|\\.jpeg|\\.txt|\\.html|\\.htm){1}.*")) {
statics++;
isStatic = true;
} else {
dynamics++;
isStatic = false;
}
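// For scope tracking, keep static URLs verbatim; normalize dynamic URLs to
// their first path segment or two so query-string variants collapse into a
// single scope entry.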
if ( !inWhitelist ) {
String newUrl = "";
if (isStatic) {
newUrl = url;
} else {
String[] urlArr = StringUtils.split(url, "/");
int count = urlArr.length;
if (count >= 3) {
newUrl = "/" + urlArr[0] + "/" + urlArr[1];
} else if (count <= 0) {
newUrl = "/";
} else {
int indexof2 = url.indexOf('?');
int indexof3 = url.indexOf('=');
if (indexof3 != -1 && indexof2 != -1) {
newUrl = url.substring(0, indexof3);
} else {
newUrl = urlArr[0];
}
}
}
mapUrl.put(newUrl, 1);
hashIpUrl.put(ip, mapUrl);
}
HashMap<String, Integer> map = null;
boolean ipMapIsExist = hashIp.containsKey(ip);
map = ipMapIsExist ? hashIp.get(ip) : new HashMap<String, Integer>();
if (inWhitelist && ipMapIsExist) {
hashIp.remove(ip);
continue;
}
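// Update this IP's counters: total, static/dynamic split, HTTP method,
// and status-code class (2xx-5xx).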
Integer codeFirst = Integer.parseInt(code.substring(0, 1));
String fieldName = "";
if ( ipMapIsExist ) {
map.put("total", map.get("total") + 1);
if ( isStatic ) {
map.put("statics", map.get("statics") + 1);
} else {
map.put("dynamics", map.get("dynamics") + 1);
}
if (method.equals("GET")) {
map.put("get", map.get("get") + 1);
} else if (method.equals("POST")) {
map.put("post", map.get("post") + 1);
} else if (method.equals("HEAD")) {
map.put("head", map.get("head") + 1);
} else {
map.put("other", map.get("other") + 1);
}
for(int c = 2; c<=5; c++){
fieldName = c + "xx";
if ( codeFirst == c ) {
map.put(fieldName, map.get(fieldName) + 1);
}
}
} else {
map.put("total", 1);
if ( isStatic ) {
map.put("statics", 1);
map.put("dynamics", 0);
} else {
map.put("statics", 0);
map.put("dynamics", 1);
}
map.put("get", 0);
map.put("post", 0);
map.put("head", 0);
map.put("other", 0);
if (method.equals("GET")) {
map.put("get", 1);
} else if (method.equals("POST")) {
map.put("post", 1);
} else if (method.equals("HEAD")) {
map.put("head", 1);
} else {
map.put("other", 1);
}
for(int c = 2; c<=5; c++){
fieldName = c + "xx";
if ( codeFirst == c ) {
map.put(fieldName, 1);
} else {
map.put(fieldName, 0);
}
}
}
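// Re-check the URL scope now that this request has been recorded; once the
// threshold is crossed, whitelist the IP and drop its per-IP state.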
if ( !inWhitelist && ipMapIsExist && mapUrlSize + 1 >= scopeThreshold) {
mapUrlSize = mapUrl.size();
if ( mapUrlSize >= scopeThreshold ) {
ipWhitelist.put(ip, 1);
hashIpUrl.remove(ip);
hashIp.remove(ip);
continue;
}
}
hashIp.put(ip, map);
/* System.out.println("total: " + total + ", static: " + statics + ", dynamic: " + dynamics);
System.out.println("#####################################"); */
}
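// Window boundary: once more than intervalTime seconds have elapsed (and the
// minute marker has moved), flush qualifying IPs to MySQL and reset state.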
if ( mm != start_mm && re_time - start_datetime > intervalTime ) {
LOG.info(line);
LOG.info(topic + " totals -> " + total + " # ip totals -> " + hashIp.size() + " # whitelist totals -> " + ipWhitelist.size());
String data = "";
ArrayList<String> ipList = new ArrayList<String>();
long totalIp = 0;
long valid = 0;
long scopeSize = 0;
for ( Iterator<String> it = hashIp.keySet().iterator(); it.hasNext(); ) {
String key = it.next();
HashMap<String, Integer> value = hashIp.get(key);
HashMap<String, Integer> hashUrl = hashIpUrl.get(key);
if (hashUrl == null) {
// The IP was whitelisted after its counters were created; skip it.
continue;
}
totalIp = value.get("total");
scopeSize = hashUrl.size();
if ( totalIp > totalThreshold && scopeSize < scopeThreshold ) {
valid++;
ipList.add("'" + topic + "','" + key + "','" + start_datetime + "','" + re_time
+ "','" + value.get("total") + "','" + value.get("statics") + "','" + value.get("dynamics")
+ "','" + value.get("2xx") + "','" + value.get("3xx") + "','" + value.get("4xx") + "','" + value.get("5xx")
+ "','" + value.get("get") + "','" + value.get("post") + "','" + value.get("head") + "','" + value.get("other")
+ "','" + scopeSize + "'");
}
}
data = StringUtils.join(ipList.toArray(), "), (");
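// The `ips` table is not defined in this repository; a compatible schema
// (column types here are assumptions) would look roughly like:
//   CREATE TABLE `ips` (
//     `topic` VARCHAR(64), `ip` VARCHAR(39),
//     `time_start` INT, `time_end` INT,
//     `total` INT, `statics` INT, `dynamics` INT,
//     `2xx` INT, `3xx` INT, `4xx` INT, `5xx` INT,
//     `get` INT, `post` INT, `head` INT, `other` INT,
//     `scope` INT
//   );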
try {
Class.forName("com.mysql.jdbc.Driver");
Connection conn = DriverManager.getConnection(mysqlUrl, mysqlUser, mysqlPassword);
Statement stmt = conn.createStatement(); // statement used to run the batched INSERT
if ( valid > 0) {
String sql = "INSERT INTO `ips` (`topic`,`ip` ,`time_start` ,`time_end` ,`total` ,`statics` ,`dynamics` ,`2xx` ,`3xx` ,`4xx` ,`5xx` ,`get` ,`post` ,`head` ,`other`, `scope`) VALUES (" + data + ");";
//System.out.println(sql);
stmt.execute(sql);
data = "";
sql = "";
}
stmt.close();
conn.close();
} catch (Exception ex) {
LOG.error("MySQL insert failed", ex);
}
total = 0;
statics = 0;
dynamics = 0;
hashIp = new Hashtable<String, HashMap<String, Integer>>(1000, 0.5F);
hashIpUrl = new Hashtable<String, HashMap<String, Integer>>(1000, 0.5F);
ipWhitelist = new HashMap<String, Integer>();
isStatic = true;
start_datetime = 0;
start_mm = 0;
}
}
}
private final BrokerHosts brokerHosts;
public XlogKafkaSpoutTopology(String kafkaZookeeper) {
brokerHosts = new ZkHosts(kafkaZookeeper);
}
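/**
 * Builds the topology: a KafkaSpout (consumer id "xlog_storm_" + topic)
 * feeding a single XlogBolt via shuffle grouping.
 */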
public StormTopology buildTopology(String topic) {
SpoutConfig kafkaConfig = new SpoutConfig(brokerHosts, topic, "", "xlog_storm_"+topic);
kafkaConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("KafkaSpout", new KafkaSpout(kafkaConfig));
builder.setBolt("XlogBolt", new XlogBolt()).shuffleGrouping("KafkaSpout");
return builder.createTopology();
}
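/**
 * Entry point. Expects one argument: the path to a Java properties file whose
 * entries are copied into the Storm Config. An illustrative file (the values
 * below are examples, not defaults):
 *
 *   xlog.zookeeper.server=zk1.example.com
 *   xlog.nimbus.host=nimbus.example.com
 *   xlog.kafka.topic.name=xlog
 *   xlog.debug=false
 *   xlog.interval.time=60
 *   insert.into.mysql.min.total=30
 *   insert.into.mysql.max.scope=10
 *   mysql.url=jdbc:mysql://db.example.com:3306/xlog
 *   mysql.user=storm
 *   mysql.password=secret
 */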
public static void main(String[] args) throws Exception {
if ( args == null || args.length != 1 ) {
System.out.println("Usage:storm jar target/storm-xlog-****-jar-with-dependencies.jar storm.xlog.XlogKafkaSpoutTopology configure_file_path");
System.exit(0);
}
File file = new File(args[0]);
if( !file.exists() ) {
System.out.println("configure file " + args[0] + "do not exist!");
System.exit(0);
}
InputStream is = new FileInputStream(file);
Properties prop = new Properties();
prop.load(is);
Config config = new Config();
for (Object key : prop.keySet()) {
config.put((String) key, prop.get(key));
}
is.close();
String kafkaZk = (String) config.get("xlog.zookeeper.server");
String nimbusIp = (String) config.get("xlog.nimbus.host");
String topic = (String) config.get("xlog.kafka.topic.name");
String debug = (String) config.get("xlog.debug");
config.put(Config.TOPOLOGY_DEBUG, "true".equalsIgnoreCase(debug));
config.put(Config.TOPOLOGY_TRIDENT_BATCH_EMIT_INTERVAL_MILLIS, 50);
XlogKafkaSpoutTopology topology = new XlogKafkaSpoutTopology(kafkaZk);
StormTopology stormTopology = topology.buildTopology(topic);
config.setNumWorkers(1);
config.setMaxTaskParallelism(1);
config.setMaxSpoutPending(10000);
config.put(Config.NIMBUS_HOST, nimbusIp);
config.put(Config.NIMBUS_THRIFT_PORT, 6627);
config.put(Config.STORM_ZOOKEEPER_PORT, 2181);
config.put(Config.STORM_ZOOKEEPER_SERVERS, Arrays.asList(kafkaZk));
StormSubmitter.submitTopology(topic, config, stormTopology);
}
}